# Chap 2 R Script

# Remove all objects from the current workspace, including hidden ones whose
# names start with a dot. The argument is written out as all.names rather
# than relying on partial matching of `all`.
rm(list = ls(all.names = TRUE))
# Set a working directory; here we create a folder named CDA in the C drive (C:) first.
# Files written later with a relative path (e.g. the stargazer tables) land here.
setwd("C:/CDA")

# The following user-written packages need to be installed before first use.
# Install each one with install.packages("<package name>"), then load it with
# library(<package name>).
# library(dplyr)         # It is already installed for Chapter 1
# library(sjmisc)        # It is already installed for Chapter 1
# library(gmodels)
# library(ggeffects)
# library(pastecs)
# library(stargazer)
# library(sjstats)
# library(vcd)

# Import GSS 2016 Stata data file
library(foreign)  # provides read.dta()
chp2 <- read.dta("C:/CDA/gss2016-chap1.dta")
# Put the columns of chp2 on the search path so they can be referenced by
# bare name (age, sex, ...) in the rest of the script; undone by detach(chp2)
attach(chp2)

# Overview of the whole data set
summary(chp2)
head(chp2)

# Univariate descriptive statistics for age
str(age)
summary(age)
mean(age, na.rm = TRUE)
sd(age, na.rm = TRUE)
max(age, na.rm = TRUE)
min(age, na.rm = TRUE)
length(age)  # number of elements, missing values included

# Descriptive statistics by group: method 1
# tapply() applies each summary function to age within every level of sex.
# The results are stored under their own names (age_*) so that the base
# functions mean(), sd(), max(), min() and length() are not shadowed by data
# objects in the workspace.
age_mean <- tapply(age, sex, mean, na.rm = TRUE)
age_sd <- tapply(age, sex, sd, na.rm = TRUE)
age_max <- tapply(age, sex, max, na.rm = TRUE)
age_min <- tapply(age, sex, min, na.rm = TRUE)
age_n <- tapply(age, sex, length)  # group size, missing ages included
# Named arguments keep the column labels mean, sd, max, min and length
cbind(
  mean = age_mean, sd = age_sd, max = age_max, min = age_min,
  length = age_n
)

# Descriptive statistics by group: method 2
library(dplyr)
# Group the data by sex, then collapse each group to one row of age summaries
gender <- chp2 %>% group_by(sex)
gender %>%
  summarize(
    mean(age, na.rm = TRUE),
    sd(age, na.rm = TRUE),
    max(age, na.rm = TRUE),
    min(age, na.rm = TRUE)
  )

# Descriptive statistics by group: method 3
library(sjmisc)
chp2 %>%
  group_by(sex) %>%
  descr(age)

# Descriptive statistics for two variables
library(pastecs)
stat.desc(chp2[c("age", "educ")])

# Descriptive statistics for two variables: method 2
# descr() comes from sjmisc, which is already loaded above
descr(chp2, age, educ)

# Descriptive statistics for multiple variables by group
chp2 %>%
  group_by(sex) %>%
  select(age, educ) %>%
  descr()

# Frequency table for a factor or categorical variable
# str() shows how degree is stored; table() counts the cases in each category
str(degree)
table(degree)
table(marital)
# Frequency table with frq() in sjmisc
# sjmisc is already loaded earlier in the script, so library() stays commented out
#library(sjmisc)
frq(chp2, degree)
frq(chp2, marital)

# Cross-tabulation
# Two-way table of counts: degree in the rows, race in the columns
tab <- table(degree, race)
summary(tab)      # number of cases and chi-square test of independence
tab               # raw counts
ftable(tab)       # the same counts as a flat table
addmargins(tab)   # counts with row and column totals
prop.table(tab)   # each cell as a proportion of the grand total

# Cross-tabulation with CrossTable() in gmodels
library(gmodels)
CrossTable(degree, race, digits = 2)
chisq.test(degree, race)

# Simple linear regression
# Rescale both income variables by dividing by 10,000. The rescaled vectors
# live in the workspace, not in chp2.
realinc1 <- realinc / 10000
realrinc1 <- realrinc / 10000

# Regress realinc1 on realrinc1
slm <- lm(realinc1 ~ realrinc1)
summary(slm)   # coefficient table and fit statistics
anova(slm)     # analysis-of-variance table
coef(slm)      # coefficient estimates only
confint(slm)   # confidence intervals for the coefficients

# Eta squared effect size with eta_sq() in sjstats
library(sjstats)
eta_sq(slm)

# Multiple linear regression
# educ and age are taken from chp2; realinc1 and realrinc1 are not columns of
# chp2, so lm() picks them up from the workspace where they were created.
mlm <- lm(
  realinc1 ~ realrinc1 + educ + age,
  data = chp2
)
summary(mlm)
anova(mlm)
coef(mlm)
confint(mlm)

# Eta squared with 95% confidence intervals
eta_sq(mlm, ci.lvl = 0.95)
# Another option: effectsize::eta_squared(mlm, partial=FALSE)
# Partial eta squared with 95% confidence intervals
eta_sq(mlm, partial = TRUE, ci.lvl = 0.95)
# More effect size measures
anova_stats(mlm, digits = 3)

# Predicted values with ggpredict() in ggeffects
library(ggeffects)

# Predictions from mlm at educ = 12, 14 and 16
mlm.educ <- ggpredict(mlm, terms = "educ[12, 14, 16]")
mlm.educ                      # formatted print-out
as.data.frame(mlm.educ)       # the same predictions as a plain data frame
sqrt(diag(vcov(mlm.educ)))    # square roots of the vcov() diagonal
plot(mlm.educ)

# Predictions over combinations of educ, realrinc1 and age, each evaluated
# at the values selected by the [meansd] option
mlm.m <- ggpredict(
  mlm,
  terms = c("educ[meansd]", "realrinc1[meansd]", "age[meansd]")
)
mlm.m

# Create a results table with stargazer()
library(stargazer)
# Both models side by side, first as plain text and then as HTML; each call
# also writes the table to the file named in `out`
stargazer(
  slm, mlm,
  type = "text", align = TRUE, out = "chp2.lrmod.txt"
)
stargazer(
  slm, mlm,
  type = "html", align = TRUE, out = "chp2.lrmod.htm"
)

# Cross-tabulation with CrossTable() in gmodels
# health and marital are used here as imported, i.e. before the recoding of
# user-defined missing values carried out in the next section
library(gmodels)
CrossTable(health, marital, digits=2)
# Chi-square test on the same two variables
chisq.test(health, marital)

# Recode user-defined missing values to NA
# Keep only health and marital, then add recoded copies: the listed codes
# become NA and every other value is copied unchanged (else=copy)
new <- chp2 %>%
  dplyr::select(health, marital) %>%
  dplyr::mutate(
    health_re = rec(health, rec = "iap=NA; dk=NA; na=NA; else=copy"),
    marital_re = rec(marital, rec = "na=NA; else=copy")
  )

# Cross-tabulate the recoded variables
table(new$health_re, new$marital_re)

# Compute chi-square test statistic and Cramer's V
library(sjstats)
xtab_statistics(new, health_re, marital_re)
cramer(health_re ~ marital_re, data = new)

# Association statistics for the same table with assocstats() in vcd
library(vcd)
tab <- table(new$health_re, new$marital_re)  # replaces the earlier `tab`
summary(assocstats(tab))

# Compute the Cramer's V by hand
# NOTE(review): 49.423, 1873 and 3 are presumably the chi-square statistic,
# the number of cases and min(rows, columns) - 1 taken from the output
# above -- confirm against that output before reusing these numbers.
sqrt(49.423 / 1873 / 3)

# Create a matrix and conduct a chi-square test
# The eight counts are filled row by row into a 4 x 2 matrix. The matrix has
# its own name so that the base function list() is not shadowed, and TRUE is
# written out because T is an ordinary variable that can be reassigned.
obs_counts <- matrix(
  c(196, 35, 160, 46, 430, 76, 33, 13),
  nrow = 4, byrow = TRUE
)
chisq.test(obs_counts)

# Remove chp2 from the search path again, undoing the earlier attach(chp2)
detach(chp2)